library(tidyverse)
library(ggridges)
library(lubridate)
library(scales)
library(corrplot)
library(factoextra)
library(roll)
library(RolWinMulCor)
library(BINCOR)
library(corrr)
dat = read_csv("../Data/final_combined_dates_filled_v1.csv")
dat$sampledate = as.Date(paste0(dat$year-1, "-12-31")) + dat$daynum_fill

Data Info / Status

dat %>% 
  mutate(lakeid = factor(lakeid, levels =  c("AL", "BM", "CB", "CR", "SP", "TB", "TR", "FI", "ME", "MO", "WI"))) %>%
  ggplot(aes(year, metric, fill=filled_flag)) +
  geom_tile(color="grey") +
  theme_bw() +
  facet_wrap(~lakeid, nrow=6)

How variables were filled

  • iceoff/iceon: FI filled with MO; northern lakes filled w/ lake-specific model of other northern lake ice on or off dates
  • doc: filled with lake-specific model (daynum_doc ~ daynum_all_other_variables)
  • chl: currently filled with lake specific modeled time series (chl ~ wtemp + DOsat), which are then used to determine peaks
  • total zoop and daphnia biomass: filled with lake-specific median daynum

Distribution of peak dates

lakes_order = c("AL", "BM", "CB", "CR", "SP", "TB", "TR", "FI", "ME", "MO", "WI")

vars_order = c("iceoff", "straton", "chlor_spring", "secchi_openwater", "daphnia_biomass", "doc", "chlor_all", "anoxia_summer", "stability", "energy", "chlor_fall", "stratoff", "iceon")

dat %>% 
  filter(metric %in% vars_order) %>% 
  mutate(lakeid = factor(lakeid, levels = lakes_order),
         metric = factor(metric, levels = rev(vars_order), labels = rev(c("iceoff", "straton", "spring bloom", "clearwater", "daphnia", "doc",  "peak chl", "anoxia summer", "stability", "energy", "fall bloom", "stratoff", "iceon")))) %>% 
  ggplot() + 
  stat_density_ridges(aes(x = as.Date(daynum, origin = as.Date('2019-01-01')), 
                          y= metric, col = metric, fill = metric), 
                      alpha = 0.5, quantile_lines = T, quantiles = 2) +
  scale_x_date(labels = date_format("%b")) +
  facet_wrap(~ (lakeid)) +
  xlab('') + ylab('Density')+
  theme_minimal() + 
  theme(axis.text=element_text(size=8))

Analyses

Eigenvalue Analysis

source("../src/timing_matrix_construction.R")

vars_order_eigen =  c("iceoff", "straton", "secchi_openwater", "daphnia_biomass", "doc", "chlor_spring", "anoxia_summer", "stability", "energy", "stratoff", "iceon")
all_var_combos = expand.grid(vars_order_eigen, vars_order_eigen)
all_lake_years = dat %>% 
  filter(metric %in% vars_order_eigen) %>% 
  select(lakeid, year) %>% 
  distinct() %>% 
  rename(id = lakeid)

all_lyv = expand_grid(all_var_combos, all_lake_years)

all_lyv_1 = dat %>% 
  filter(metric %in% vars_order_eigen) %>% 
  select(lakeid, year, metric, daynum_fill) %>% 
  rename(Var1 = metric, id=lakeid, value=daynum_fill) %>% 
  right_join(all_lyv) %>% 
  rename(Value1 = value)

all_lyv_2 = dat %>% 
  filter(metric %in% vars_order) %>% 
  select(lakeid, year, metric, daynum_fill) %>% 
  rename(Var2 = metric, id=lakeid, value=daynum_fill) %>%
  right_join(all_lyv_1) %>% 
  rename(Value2 = value) %>% 
  select(id, year, Var1, Value1, Var2, Value2) %>% 
  mutate(doy_diff = Value2 - Value1)

lake_years = dat %>% 
  select(lakeid, year) %>% 
  distinct()

lake_years$eigen = as.numeric(NA)

lakes = unique(lake_years$lakeid)
for(l in lakes){
  hold_mats = matrix_single_lake(timings.df = all_lyv_2, single_lake = l)
  for(i in 1:length(dimnames(hold_mats)[[3]])){
    cur_year = dimnames(hold_mats)[[3]][i]
    cur_ind = lake_years$lakeid == l & lake_years$year == as.numeric(cur_year)
    x = abs(hold_mats[,,i])
    error <- try(eigen(x), silent = T)
    if (class(error) != "try-error"){
      lake_years[cur_ind, "eigen"] = max(error$values)
    }
  }
}

ggplot(lake_years %>% arrange(year)) +
  geom_point(aes(year, eigen, col=lakeid, group=lakeid)) +
  geom_line(aes(year, eigen, col=lakeid, group=lakeid)) +
  scale_color_brewer(palette = "Paired") +
  theme_bw()

Correlation of eigenvalues

eigen_corrs = lake_years %>% 
  pivot_wider(names_from = lakeid, values_from = eigen) %>% 
  select(-year) %>%
  cor(use="complete.obs") 
# diag(eigen_corrs) = 0

eigen_corrs %>% 
  corrplot(method="circle")

PCA

Setup select lakes and variables to analyze:

var_pca = c("iceoff", "straton", "secchi_openwater", "dapnia_biomass", "doc", "chlor_spring", "anoxia_summer", "stability", "energy", "stratoff", "iceon")
lakeid_pca = c("FI", "ME", "MO", "WI", "AL", "BM", "CB", "CR", "SP", "TB", "TR")

dat_pca = dat %>% 
  filter(metric %in% var_pca & lakeid %in% lakeid_pca) %>% 
  select(lakeid, year, metric, daynum_fill) %>% 
  pivot_wider(names_from = "metric", values_from="daynum_fill")

Do the PCA, plot results:

pca = prcomp(dat_pca[, 3:ncol(dat_pca)], scale=T, center=T)

fviz_eig(pca)

fviz_pca_var(pca,
             col.var = "contrib", # Color by contributions to the PC
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE     # Avoid text overlapping
)

Principal components correlation analysis

var = get_pca_var(pca)
inds = get_pca_ind(pca)

pca_res = dat_pca[, c("lakeid", "year")]
pca_res$PC1 = pca$x[,1]
pca_res$PC2 = pca$x[,2]
pca_res$PC3 = pca$x[,3]
pca_res$PC4 = pca$x[,4]

pca_res_cor = pca_res %>% 
  pivot_longer(cols = c("PC1", "PC2", "PC3", "PC4")) %>% 
  pivot_wider(names_from = "lakeid", values_from="value") %>% 
  arrange(name, year)
  

pca_matrix_p1 = pca_res_cor %>% 
  filter(name == "PC1") %>% 
  select(-year) %>% 
  summarise(as.data.frame(cor(.[,lakeid_pca], use="complete.obs"))) %>% 
  as.matrix() 
rownames(pca_matrix_p1) = lakeid_pca
# diag(pca_matrix) = 0

pca_matrix_p2 = pca_res_cor %>% 
  filter(name == "PC2") %>% 
  select(-year) %>% 
  summarise(as.data.frame(cor(.[,lakeid_pca], use="complete.obs"))) %>% 
  as.matrix() 
rownames(pca_matrix_p2) = lakeid_pca
# diag(pca_matrix) = 0

Plot first two PC correlations

# dev.off()
par(mfrow=c(1,2))
corrplot(pca_matrix_p1, method="circle", title="PC1", mar=c(0,0,1,0))
corrplot(pca_matrix_p2, method="circle", title = "PC2", mar=c(0,0,1,0))

PCA without ice dates

var_pca_noice = c("straton", "secchi_openwater", "daphnia_biomass", "doc", "chlor_spring", "anoxia_summer", "stability", "energy", "stratoff")
lakeid_pca = c("FI", "ME", "MO", "WI", "AL", "BM", "CB", "CR", "SP", "TB", "TR")

dat_pca_noice = dat %>% 
  filter(metric %in% var_pca_noice & lakeid %in% lakeid_pca) %>% 
  select(lakeid, year, metric, daynum_fill) %>% 
  pivot_wider(names_from = "metric", values_from="daynum_fill")

pca_noice = prcomp(dat_pca_noice[, 3:ncol(dat_pca_noice)], scale=T, center=T)

fviz_eig(pca_noice)

fviz_pca_var(pca_noice,
             col.var = "contrib", # Color by contributions to the PC
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE     # Avoid text overlapping
)

var = get_pca_var(pca_noice)
inds = get_pca_ind(pca_noice)

pca_res_noice = dat_pca_noice[, c("lakeid", "year")]
pca_res_noice$PC1 = pca_noice$x[,1]
pca_res_noice$PC2 = pca_noice$x[,2]
pca_res_noice$PC3 = pca_noice$x[,3]
pca_res_noice$PC4 = pca_noice$x[,4]

pca_res_cor_noice = pca_res_noice %>% 
  pivot_longer(cols = c("PC1", "PC2", "PC3", "PC4")) %>% 
  pivot_wider(names_from = "lakeid", values_from="value") %>% 
  arrange(name, year)
  

pca_matrix_p1_noice = pca_res_cor_noice %>% 
  filter(name == "PC1") %>% 
  select(-year) %>% 
  summarise(as.data.frame(cor(.[,lakeid_pca], use="complete.obs"))) %>% 
  as.matrix() 
rownames(pca_matrix_p1_noice) = lakeid_pca
# diag(pca_matrix) = 0

pca_matrix_p2_noice = pca_res_cor_noice %>% 
  filter(name == "PC2") %>% 
  select(-year) %>% 
  summarise(as.data.frame(cor(.[,lakeid_pca], use="complete.obs"))) %>% 
  as.matrix() 
rownames(pca_matrix_p2_noice) = lakeid_pca

par(mfrow=c(1,2))
corrplot(pca_matrix_p1_noice, method="circle", title="PC1", mar=c(0,0,1,0))
corrplot(pca_matrix_p2_noice, method="circle", title = "PC2", mar=c(0,0,1,0))

PCA north vs. south separate

dat_pca_N = dat %>% 
  filter(metric %in% var_pca & lakeid %in% c("AL", "BM", "CB", "CR", "SP", "TB", "TR")) %>% 
  select(lakeid, year, metric, daynum_fill) %>% 
  pivot_wider(names_from = "metric", values_from="daynum_fill")

dat_pca_S = dat %>% 
  filter(metric %in% var_pca & lakeid %in% c("FI", "ME", "MO", "WI")) %>% 
  select(lakeid, year, metric, daynum_fill) %>% 
  pivot_wider(names_from = "metric", values_from="daynum_fill")

pca_N = prcomp(dat_pca_N[, 3:ncol(dat_pca_N)], scale=T, center=T)
pca_S = prcomp(dat_pca_S[, 3:ncol(dat_pca_S)], scale=T, center=T)

# fviz_eig(pca_noice)
pca_plot_N = fviz_pca_var(pca_N,
             col.var = "contrib", # Color by contributions to the PC
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE,
             title ="N lakes")    # Avoid text overlapping
pca_plot_S = fviz_pca_var(pca_S,
             col.var = "contrib", # Color by contributions to the PC
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE,
             title ="S lakes")

gridExtra::grid.arrange(pca_plot_N, pca_plot_S, nrow=1)

Correlation analysis

All lakes

cor_vars = vars_order

dat %>% 
  filter(metric %in% cor_vars) %>% 
  select(lakeid, year, metric, daynum_fill) %>% 
  pivot_wider(names_from = metric, values_from = daynum_fill) %>% 
  select(-lakeid, -year) %>% 
  correlate() %>% 
  network_plot(min_cor = 0.2)

North vs. South

cor_vars = vars_order
# "AL", "BM", "CB", "CR", "SP", "TB", "TR", "FI", "ME", "MO", "WI"
pN = dat %>% 
  filter(metric %in% cor_vars & lakeid %in% c("AL", "BM", "CB", "CR", "SP", "TB", "TR")) %>% 
  select(lakeid, year, metric, daynum_fill) %>% 
  pivot_wider(names_from = metric, values_from = daynum_fill) %>% 
  select(-lakeid, -year) %>% 
  correlate() %>% 
  network_plot(min_cor = 0.2, colours = c("red", "white", "blue")) +
  ggtitle("Northern Lakes")


pS = dat %>% 
  filter(metric %in% cor_vars & lakeid %in% c("FI", "ME", "MO", "WI")) %>% 
  select(lakeid, year, metric, daynum_fill) %>% 
  pivot_wider(names_from = metric, values_from = daynum_fill) %>% 
  select(-lakeid, -year) %>% 
  correlate() %>% 
  network_plot(min_cor = 0.2, colours = c("red", "white", "blue")) +
  ggtitle("Southern Lakes")

gridExtra::grid.arrange(pN, pS, nrow=1)

(I’m missing code for the rolling window correlation analysis Kait did)

Current Takeaways

  1. Physical events have the most consistent “phenology” across years.
  2. For chemical and biolgical events, eutrophic lakes adhere most strongly to tradition PEG model.
  3. Across all lakes and variables, ice cover explains similarity in phenological event timing and separates the lakes into groups geographically. (any better word than “explains”?)
  4. Ignoring/limiting the effect of ice cover, physical events are still the largest contributor to variation in phenology, but there is some evidence for differences the importance of trophic status and geographical controls (note that these variables are confounded in our dataset).

TODO

  1. Get FI/MO/WI “bad” chlorophyll data files from Mark, use to fill peak chlorophyll dates
  2. Network connectivity inference analysis
  3. Other analyses?
  4. Dive into literature
  5. Finalize figures
  6. JASM talk
  7. Write manuscript